#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @Version : 1.0
# @Time    : 2018/9/10
# @Author  : 圈圈烃
# @File    : Sougou_Spider
# @Description: 搜狗词库爬虫
#
#
from bs4 import BeautifulSoup
from urllib.parse import unquote
import requests
import re
import os


class SougouSpider:
    """Scraper for Sogou pinyin dictionary (.scel) category/download pages.

    Note: ``headers`` and ``res`` are class-level attributes shared by all
    instances; every ``get_*`` parser method reads the page most recently
    fetched by ``get_html``, so callers must call ``get_html`` first.
    """

    # Default request headers; the "Host" field is filled in per request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # Most recent successful response from get_html (shared class state).
    res = None

    def __init__(self, url):
        # url: page fetched by the next get_html() call; callers reassign it
        # between requests to reuse a single spider instance.
        self.url = url

    def get_html(self, open_proxy=False, ip_proxies=None):
        """Fetch ``self.url`` with up to 3 retries.

        :param open_proxy: when True (and ``ip_proxies`` is given), route the
                           request through an HTTP proxy.
        :param ip_proxies: proxy address, e.g. "103.109.58.242:8080".
        :return: the ``requests.Response`` on success, ``None`` on failure.
        """
        max_retries = 3
        for retry in range(max_retries):
            try:
                # The Host header must match the URL's host ("//host/...").
                host_url = re.findall(r'//(.*?)/', self.url)[0]
                SougouSpider.headers["Host"] = host_url
                # Fix: guard against open_proxy=True with no proxy address,
                # which previously raised TypeError and burned all retries.
                if open_proxy and ip_proxies:
                    proxies = {"http": "http://" + ip_proxies, }
                    SougouSpider.res = requests.get(self.url, headers=SougouSpider.headers,
                                                    proxies=proxies, timeout=10)
                else:
                    SougouSpider.res = requests.get(self.url, headers=SougouSpider.headers, timeout=10)
                # Auto-detect the HTML encoding from the response body.
                SougouSpider.res.encoding = SougouSpider.res.apparent_encoding
                print("Html页面获取成功 " + self.url)
                return SougouSpider.res
            except Exception as e:
                if retry < max_retries - 1:
                    print(f"Html页面获取失败，正在重试 ({retry + 1}/{max_retries}) " + self.url)
                    print(e)
                else:
                    print("Html页面获取失败，已达到最大重试次数 " + self.url)
                    print(e)
        return None

    def get_cate_1_list(self):
        """Return the list of top-level category URLs from the last fetched page."""
        if SougouSpider.res is None:
            print("未成功获取HTML页面，无法获取大分类链接")
            return []
        soup = BeautifulSoup(SougouSpider.res.text, "html.parser")
        dict_nav = soup.find("div", id="dict_nav_list")
        # Fix: the original crashed with AttributeError when the nav div was
        # missing (layout change / wrong page); return an empty list instead.
        if dict_nav is None:
            return []
        return ["https://pinyin.sogou.com" + a['href'].strip()
                for a in dict_nav.find_all("a")]

    def get_cate_2_1_list(self):
        """Return {name: url} for the first sub-category layout (city list)."""
        if SougouSpider.res is None:
            print("未成功获取HTML页面，无法获取第一种小分类链接")
            return {}
        dict_cate_2_1_dict = {}
        soup = BeautifulSoup(SougouSpider.res.text, "html.parser")
        for div in soup.find_all("div", class_="cate_no_child citylistcate no_select"):
            dict_td_url = "https://pinyin.sogou.com" + div.a['href'].strip()
            dict_cate_2_1_dict[div.get_text().replace("\n", "")] = dict_td_url
        return dict_cate_2_1_dict

    def get_cate_2_2_list(self):
        """Return {name: url} for the second sub-category layout."""
        if SougouSpider.res is None:
            print("未成功获取HTML页面，无法获取第二种小分类链接")
            return {}
        dict_cate_2_2_dict = {}
        soup = BeautifulSoup(SougouSpider.res.text, "html.parser")
        # Two div flavours carry the links: leaf categories and parents.
        for class_name in ("cate_no_child no_select", "cate_has_child no_select"):
            for div in soup.find_all("div", class_=class_name):
                dict_td_url = "https://pinyin.sogou.com" + div.a['href'].strip()
                dict_cate_2_2_dict[div.get_text().replace("\n", "")] = dict_td_url
        return dict_cate_2_2_dict

    def get_page(self):
        """Return the number of result pages for the current category (>= 1)."""
        if SougouSpider.res is None:
            print("未成功获取HTML页面，无法获取页数")
            return 1
        soup = BeautifulSoup(SougouSpider.res.text, "html.parser")
        pager = soup.find("div", id="dict_page_list")
        if pager is None:
            return 1
        links = pager.find_all("a")
        if len(links) < 2:
            return 1
        # The second-to-last anchor holds the last page number.
        return int(links[-2].string)

    def get_download_list(self):
        """Return {sanitized_name: download_url} for the current page."""
        if SougouSpider.res is None:
            print("未成功获取HTML页面，无法获取下载链接")
            return {}
        dict_dl_dict = {}
        name_pattern = re.compile(r'name=(.*)')
        soup = BeautifulSoup(SougouSpider.res.text, "html.parser")
        for div in soup.find_all("div", class_="dict_dl_btn"):
            dict_dl_url = div.a['href']
            dict_name = name_pattern.findall(dict_dl_url)[0]
            # URL-decode and replace characters that are illegal in file names.
            dict_ch_name = unquote(dict_name, 'utf-8').replace("/", "-").replace(",", "-").replace("|", "-") \
                .replace("\\", "-").replace("'", "-")
            dict_dl_dict[dict_ch_name] = dict_dl_url
        return dict_dl_dict

    def download_dict(self, dl_url, path, open_proxy=False, ip_proxies=None):
        """Download one dictionary file to ``path`` with up to 3 retries.

        Fix: the proxy used to be hard-coded to 117.127.0.196:80, making every
        download depend on a single (likely dead) proxy.  It is now an optional
        parameter matching ``get_html``; by default the request is direct.

        :param dl_url: download URL of the .scel file.
        :param path: destination file path.
        :param open_proxy: route through an HTTP proxy when True.
        :param ip_proxies: proxy address, e.g. "103.109.58.242:8080".
        """
        max_retries = 3
        for retry in range(max_retries):
            try:
                host_url = re.findall(r'//(.*?)/', dl_url)[0]
                SougouSpider.headers["Host"] = host_url
                if open_proxy and ip_proxies:
                    proxies = {"http": "http://" + ip_proxies, }
                    res = requests.get(dl_url, headers=SougouSpider.headers, proxies=proxies, timeout=10)
                else:
                    res = requests.get(dl_url, headers=SougouSpider.headers, timeout=10)
                with open(path, "wb") as fw:
                    fw.write(res.content)
                print("文件下载成功: " + path)
                return
            except Exception as e:
                if retry < max_retries - 1:
                    print(f"文件下载失败，正在重试 ({retry + 1}/{max_retries}) " + path)
                    print(e)
                else:
                    print("文件下载失败，已达到最大重试次数 " + path)
                    print(e)


def main():
    """Crawl every Sogou dict category under ``url`` and download all .scel files.

    NOTE(review): ``url`` and ``save_dir`` are hard-coded for the original
    author's machine; adjust them before running elsewhere.
    """
    url = "https://pinyin.sogou.com/dict/cate/index/436"
    save_dir = r"D:\Users\Hasee\Desktop\工作\电子游戏"
    dirnames = ['城市信息', '自然科学', '社会科学', '工程应用', '农林渔畜', '医学医药',
                '电子游戏', '艺术设计', '生活百科', '运动休闲', '人文科学', '娱乐休闲']
    # Fix: makedirs(..., exist_ok=True) replaces os.mkdir wrapped in a broad
    # try/except that printed "already exists" noise on every rerun.
    for dirname in dirnames:
        os.makedirs(os.path.join(save_dir, dirname), exist_ok=True)
    # Fetch the category index and collect the top-level category links.
    mysougou = SougouSpider(url)
    mysougou.get_html()
    dict_cate_1_urls = mysougou.get_cate_1_list()
    # Fix: pair category URLs with directory names via zip() instead of a
    # manually maintained counter, which raised IndexError whenever the site
    # returned more categories than there are entries in dirnames.
    for dirname, dict_cate_1_url in zip(dirnames, dict_cate_1_urls):
        save_dir_1 = os.path.join(save_dir, dirname)  # top-level category dir
        mysougou.url = dict_cate_1_url
        mysougou.get_html()
        # Category 167 (city list) uses a different page layout.
        if dict_cate_1_url == "https://pinyin.sogou.com/dict/cate/index/167":
            dict_cate_2_dict = mysougou.get_cate_2_1_list()
        else:
            dict_cate_2_dict = mysougou.get_cate_2_2_list()
        for cate_name, cate_url in dict_cate_2_dict.items():
            save_dir_2 = os.path.join(save_dir_1, cate_name)  # sub-category dir
            try:
                os.makedirs(save_dir_2, exist_ok=True)
            except OSError as e:
                # Sub-category names may still contain characters invalid on
                # Windows; log and keep going like the original did.
                print(e)
            print(save_dir_2)
            try:
                mysougou.url = cate_url
                mysougou.get_html()
                pages = mysougou.get_page()
            except Exception as e:
                print(e)
                pages = 1  # fall back to a single result page
            # Walk every result page and download each dictionary once.
            for page in range(1, pages + 1):
                mysougou.url = cate_url + "/default/" + str(page)
                mysougou.get_html()
                dict_dl_dict = mysougou.get_download_list()
                for name, dl_url in dict_dl_dict.items():
                    save_path = os.path.join(save_dir_2, name + ".scel")
                    if os.path.exists(save_path):
                        print(name + ">>>>>>文件已存在")  # skip already-downloaded file
                    else:
                        mysougou.download_dict(dl_url, save_path)


# Entry point: run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
